Max Error (max_error)#
max_error (also called maximum absolute error) measures the single worst absolute mistake a regression model makes on a dataset.
Goals
Define max_error precisely (and connect it to the \(\ell_\infty\) norm)
Visualize what “worst-case error” means
Implement the metric from scratch in NumPy
Compare it to MAE and RMSE
Use a smooth surrogate to optimize a simple linear regression model for worst-case error
# Third-party imports: NumPy for numerics, Plotly for interactive figures,
# scikit-learn for the reference metric implementations.
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import os
import plotly.io as pio
from plotly.subplots import make_subplots
from sklearn.metrics import max_error, mean_absolute_error, mean_squared_error

# Notebook-wide plotting / printing configuration.
pio.templates.default = "plotly_white"
# Allow overriding the renderer (e.g. "browser") via an environment variable.
pio.renderers.default = os.environ.get("PLOTLY_RENDERER", "notebook")
np.set_printoptions(precision=4, suppress=True)
# Seeded generator so every run of the notebook produces the same data.
rng = np.random.default_rng(42)
1) Definition#
Let \(y \in \mathbb{R}^n\) be targets and \(\hat{y} \in \mathbb{R}^n\) predictions. Define residuals \(r_i = y_i - \hat{y}_i\).
The max error is the maximum absolute residual: \(\mathrm{MaxError}(y, \hat{y}) = \max_{i} |r_i| = \|y - \hat{y}\|_\infty\).
Best possible value: \(0\) (perfect predictions)
Units: same as the target \(y\)
Important behavior: only the worst point matters
Relation to MAE / RMSE#
If we define absolute errors \(e_i = |y_i-\hat{y}_i| \ge 0\), then: \(\mathrm{MAE} = \frac{1}{n}\sum_{i=1}^{n} e_i\), \(\mathrm{RMSE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n} e_i^2}\), and \(\mathrm{MaxError} = \max_i e_i\).
So, for the same residuals: MaxError \(\ge\) RMSE \(\ge\) MAE.
# Tiny worked example: five targets and their predictions.
y_true = np.array([2.0, 0.0, 4.0, 1.0, 3.0])
y_pred = np.array([1.7, 0.2, 3.9, 2.3, 2.8])

residual = y_true - y_pred    # signed errors r_i = y_i - yhat_i
abs_error = np.abs(residual)  # absolute errors |r_i|
# max_error is simply the largest absolute residual.
max_err_np = float(np.max(abs_error))
max_err_skl = float(max_error(y_true, y_pred))  # sklearn reference value

print("residuals:", residual)
print("|residuals|:", abs_error)
print(f"max_error (NumPy): {max_err_np:.4f}")
print(f"max_error (sklearn): {max_err_skl:.4f}")
residuals: [ 0.3 -0.2 0.1 -1.3 0.2]
|residuals|: [0.3 0.2 0.1 1.3 0.2]
max_error (NumPy): 1.3000
max_error (sklearn): 1.3000
# Pre-compute plotting helpers; highlight the worst-case sample.
idx = np.arange(len(y_true))
max_idx = int(np.argmax(abs_error))  # index of the largest |residual|
# Axis range padded slightly beyond the data for the y = yhat reference line.
lo = min(y_true.min(), y_pred.min()) - 0.5
hi = max(y_true.max(), y_pred.max()) + 0.5
line = np.linspace(lo, hi, 200)
# Every point blue except the worst one, drawn in crimson.
colors = np.array(["#1f77b4"] * len(y_true), dtype=object)
colors[max_idx] = "crimson"
# Two-panel figure: (left) predictions vs targets, (right) per-sample |error|.
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=("Predictions vs targets", "Absolute errors |y-ŷ|"),
)
# Left panel: scatter of (y, yhat) with per-point hover info.
fig.add_trace(
    go.Scatter(
        x=y_true,
        y=y_pred,
        mode="markers",
        name="samples",
        marker=dict(size=10, color=colors),
        customdata=np.column_stack([idx, abs_error]),
        hovertemplate="i=%{customdata[0]}<br>y=%{x:.2f}<br>ŷ=%{y:.2f}<br>|e|=%{customdata[1]:.2f}<extra></extra>",
    ),
    row=1,
    col=1,
)
# Reference diagonal: points on y = yhat have zero error.
fig.add_trace(
    go.Scatter(
        x=line,
        y=line,
        mode="lines",
        name="y = ŷ",
        line=dict(color="gray", dash="dash"),
    ),
    row=1,
    col=1,
)
# Mark the single worst prediction with an X.
fig.add_trace(
    go.Scatter(
        x=[y_true[max_idx]],
        y=[y_pred[max_idx]],
        mode="markers+text",
        name="worst point",
        marker=dict(size=14, color="crimson", symbol="x"),
        text=["max"],
        textposition="top center",
        showlegend=False,
    ),
    row=1,
    col=1,
)
# Right panel: bar chart of absolute errors per sample index.
fig.add_trace(
    go.Bar(
        x=idx,
        y=abs_error,
        marker_color=colors,
        name="|error|",
        hovertemplate="i=%{x}<br>|e|=%{y:.3f}<extra></extra>",
    ),
    row=1,
    col=2,
)
# Annotate the maximum bar with its value.
fig.add_trace(
    go.Scatter(
        x=[max_idx],
        y=[abs_error[max_idx]],
        mode="markers+text",
        marker=dict(size=10, color="crimson"),
        text=[f"max = {abs_error[max_idx]:.2f}"],
        textposition="top center",
        showlegend=False,
        hoverinfo="skip",
    ),
    row=1,
    col=2,
)
fig.update_xaxes(title_text="y", row=1, col=1)
fig.update_yaxes(title_text="ŷ", row=1, col=1)
fig.update_xaxes(title_text="sample index i", row=1, col=2)
fig.update_yaxes(title_text="|y - ŷ|", row=1, col=2)
fig.update_layout(title=f"max_error = {max_err_np:.3f}")
fig.show()
2) Intuition: why max_error behaves differently#
Unlike MAE/RMSE (which average over points), max_error is a minimax statistic:
If you improve many non-worst points but the worst point stays the same, max_error does not change.
If a single point gets worse and becomes the new maximum, max_error jumps.
A good way to see this is to vary the prediction for one sample while keeping all others fixed.
# Experiment: perturb one prediction and watch how each metric reacts.
n = 60
y_true = rng.normal(0, 1.0, size=n)
y_pred_base = y_true + rng.normal(0, 0.6, size=n)

j = 0  # we'll perturb this one prediction
deltas = np.linspace(-6, 6, 400)

max_vals, mae_vals, rmse_vals = [], [], []
for d in deltas:
    y_pred = y_pred_base.copy()
    y_pred[j] += d  # shift only sample j; all other predictions stay fixed
    r = y_true - y_pred
    e = np.abs(r)
    max_vals.append(float(np.max(e)))
    mae_vals.append(float(np.mean(e)))
    rmse_vals.append(float(np.sqrt(np.mean(r**2))))

# MAE/RMSE vary smoothly with delta; max_error stays flat until sample j
# becomes the worst point, then grows linearly (a kink, not a curve).
fig = go.Figure()
fig.add_trace(go.Scatter(x=deltas, y=max_vals, mode="lines", name="max_error"))
fig.add_trace(go.Scatter(x=deltas, y=mae_vals, mode="lines", name="MAE"))
fig.add_trace(go.Scatter(x=deltas, y=rmse_vals, mode="lines", name="RMSE"))
fig.update_layout(
    title="Perturbing one prediction: max_error vs MAE vs RMSE",
    xaxis_title="delta added to one prediction",
    yaxis_title="metric value",
)
fig.show()
3) max_error from scratch (NumPy)#
In scikit-learn, max_error is defined for single-output regression (multioutput is not supported).
Below is a minimal NumPy implementation with similar shape checks.
def max_error_np(y_true, y_pred):
    """Return the maximum absolute error ``max_i |y_true_i - y_pred_i|``.

    Mirrors ``sklearn.metrics.max_error``: defined only for single-output
    (1-D) regression; equals the l-infinity norm of the residual vector.

    Parameters
    ----------
    y_true, y_pred : array-like of shape (n,)
        Targets and predictions, same shape, non-empty.

    Returns
    -------
    float
        The largest absolute residual.

    Raises
    ------
    ValueError
        If either input is not 1-D, shapes differ, or inputs are empty.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.ndim != 1 or y_pred.ndim != 1:
        raise ValueError("Multioutput not supported in max_error")
    if y_true.shape != y_pred.shape:
        raise ValueError(f"Shape mismatch: {y_true.shape} vs {y_pred.shape}")
    if y_true.size == 0:
        # np.max on an empty array raises a cryptic reduction error;
        # fail with an explicit message instead.
        raise ValueError("max_error is undefined for empty inputs")
    return float(np.max(np.abs(y_true - y_pred)))
# Quick sanity check: our implementation agrees with sklearn on random data.
for _ in range(3):
    yt = rng.normal(size=20)
    yp = yt + rng.normal(0, 0.5, size=20)
    assert np.isclose(max_error_np(yt, yp), max_error(yt, yp))
print("ok: max_error_np matches sklearn on random tests")
ok: max_error_np matches sklearn on random tests
4) Example: one bad outlier dominates#
Because max_error is a worst-case statistic, a single extreme residual can dominate the score.
This can be a feature (if the worst-case matters) or a bug (if the worst-case is just noise).
# Demonstrate outlier sensitivity: a clean fit plus one injected huge error.
n = 40
y_true = rng.normal(0, 1.0, size=n)
y_pred = y_true + rng.normal(0, 0.3, size=n)

# Inject one huge mistake
k = 7
y_pred[k] += 6.0

r = y_true - y_pred
e = np.abs(r)
mae = float(np.mean(e))
rmse = float(np.sqrt(np.mean(r**2)))
mx = float(np.max(e))

# The averaged metrics barely move; max_error is set by the single outlier.
print(f"MAE = {mae:.3f}")
print(f"RMSE = {rmse:.3f}")
print(f"max_error = {mx:.3f}")

# Bar chart of |errors| with the worst sample highlighted in crimson.
colors = np.array(["#1f77b4"] * n, dtype=object)
colors[int(np.argmax(e))] = "crimson"
fig = px.bar(
    x=np.arange(n),
    y=e,
    title="Absolute errors with a single outlier",
    labels={"x": "sample index i", "y": "|y - ŷ|"},
)
fig.update_traces(marker_color=colors)
fig.add_hline(y=mx, line_dash="dash", line_color="crimson", annotation_text=f"max = {mx:.2f}")
fig.show()
MAE = 0.382
RMSE = 0.996
max_error = 6.016
5) Using max_error as an optimization objective (minimax regression)#
For a model \(\hat{y}_i = f(x_i;\theta)\), minimizing max error means solving \(\min_\theta \max_i |y_i - f(x_i;\theta)|\).
For a line \(\hat{y}_i = b_0 + b_1 x_i\) this is a classic problem: \(\ell_\infty\) (Chebyshev) regression.
Linear-programming form (convex)#
Introduce a slack variable \(t \ge 0\) representing the maximum absolute residual: \(\min_{b_0, b_1, t}\; t \quad \text{s.t.} \quad -t \le y_i - (b_0 + b_1 x_i) \le t \;\; \forall i\).
This is convex, but the max/absolute value makes it non-smooth, so plain gradient descent on the exact objective is awkward.
Smooth surrogate (for gradient descent)#
We can build a differentiable approximation in two steps:
Smooth absolute value
Smooth max via log-sum-exp
With \(r_i = y_i - (b_0 + b_1 x_i)\) and \(z_i = \sqrt{r_i^2+\varepsilon}\), the surrogate is \(\tilde{J}(b_0, b_1) = \frac{1}{\alpha}\log\sum_{i=1}^{n} e^{\alpha z_i}\).
As \(\alpha \to \infty\) and \(\varepsilon \to 0\), \(\tilde{J}\) approaches the true max error.
Gradients#
Let \(w_i = \dfrac{e^{\alpha z_i}}{\sum_j e^{\alpha z_j}}\) (softmax weights over the smoothed errors).
Then: \(\dfrac{\partial \tilde{J}}{\partial b_0} = -\sum_i w_i \dfrac{r_i}{z_i}\) and \(\dfrac{\partial \tilde{J}}{\partial b_1} = -\sum_i w_i \dfrac{r_i}{z_i}\, x_i\).
Interpretation: the gradient becomes a weighted combination of signs, and with large \(\alpha\) the weights concentrate on the current worst-error points.
# Synthetic data with a couple of outliers: a noisy line y = 1.5 + 2x.
n = 140
x = rng.uniform(-3, 3, size=n)
b0_true = 1.5
b1_true = 2.0
noise = rng.normal(0, 0.8, size=n)
y = b0_true + b1_true * x + noise
# Corrupt two random points so the worst-case objective has something to fight.
outlier_idx = rng.choice(n, size=2, replace=False)
y[outlier_idx] += rng.normal(0, 6.0, size=2)

fig = px.scatter(x=x, y=y, title="Synthetic regression data (with a couple outliers)", labels={"x": "x", "y": "y"})
fig.add_trace(
    go.Scatter(
        x=x[outlier_idx],
        y=y[outlier_idx],
        mode="markers",
        marker=dict(size=12, color="crimson", symbol="x"),
        name="outliers",
    )
)
fig.show()
def smooth_abs(r, eps=1e-6):
    """Differentiable stand-in for ``|r|``: ``sqrt(r**2 + eps)``.

    Strictly positive for eps > 0, so ratios like r / smooth_abs(r) are
    always defined; approaches the true absolute value as eps -> 0.
    """
    return np.sqrt(np.square(r) + eps)
def smooth_max(z, alpha=30.0):
    """Log-sum-exp approximation of ``max(z)``: ``(1/alpha) * log(sum(exp(alpha*z)))``.

    Always upper-bounds the true maximum (gap at most ``log(n)/alpha``) and
    tightens as alpha grows. Evaluated in shifted form so the exponentials
    never overflow.
    """
    scaled = z * alpha
    shift = np.max(scaled)               # subtract the max before exponentiating
    log_sum = np.log(np.sum(np.exp(scaled - shift)))
    return float((shift + log_sum) / alpha)
def smooth_max_weights(z, alpha=30.0):
    """Softmax weights over ``alpha * z`` (the gradient of the log-sum-exp max).

    Returns nonnegative weights summing to 1; larger entries of ``z`` get
    exponentially more weight as alpha grows. The max is subtracted before
    exponentiating for numerical stability.
    """
    shifted = alpha * z - np.max(alpha * z)
    unnormalized = np.exp(shifted)
    return unnormalized / np.sum(unnormalized)
def smooth_max_error_for_line(x, y, b0, b1, *, alpha=30.0, eps=1e-6):
    """Smooth surrogate for the max_error of the line ``b0 + b1*x`` on (x, y).

    Combines the smoothed absolute residuals with the log-sum-exp smooth max.
    """
    residuals = y - (b0 + b1 * x)
    smoothed = smooth_abs(residuals, eps=eps)
    return smooth_max(smoothed, alpha=alpha)
def smooth_max_error_gradients_for_line(x, y, b0, b1, *, alpha=30.0, eps=1e-6):
    """Smooth max-error objective for the line ``b0 + b1*x`` plus its gradients.

    Returns ``(obj, d_obj/d_b0, d_obj/d_b1, softmax_weights)``. With smoothed
    errors ``e_i = smooth_abs(r_i)`` and softmax weights ``w_i``, the chain
    rule gives ``d obj / d r_i = w_i * r_i / e_i``; the residual depends on
    the coefficients via ``r_i = y_i - b0 - b1 * x_i``, hence the minus signs.
    """
    residuals = y - (b0 + b1 * x)
    smoothed = smooth_abs(residuals, eps=eps)
    weights = smooth_max_weights(smoothed, alpha=alpha)
    objective = smooth_max(smoothed, alpha=alpha)
    dobj_dr = weights * (residuals / smoothed)
    grad_b0 = float(-np.sum(dobj_dr))
    grad_b1 = float(-np.sum(dobj_dr * x))
    return float(objective), grad_b0, grad_b1, weights
def fit_line_minimax_gd(x, y, *, alpha=30.0, eps=1e-6, lr=0.1, n_steps=2000):
    """Fit a line ``b0 + b1*x`` by gradient descent on the smooth max-error surrogate.

    Parameters
    ----------
    x, y : 1-D arrays of training inputs and targets.
    alpha : sharpness of the log-sum-exp smooth max (larger = closer to true max).
    eps : smoothing constant for the smooth absolute value.
    lr : gradient-descent learning rate.
    n_steps : number of descent steps from the (0, 0) initialization.

    Returns
    -------
    (b0, b1), history : final coefficients and a dict of training curves
        (step, b0, b1, smooth objective, true max_error), recorded every
        10 steps plus the final step.

    NOTE(review): history is recorded *before* each parameter update, so the
    last entry (step n_steps - 1) reflects the parameters one step before
    the returned (b0, b1).
    """
    b0, b1 = 0.0, 0.0
    history = {
        "step": [],
        "b0": [],
        "b1": [],
        "smooth_obj": [],
        "max_error": [],
    }
    for step in range(n_steps):
        obj, db0, db1, _ = smooth_max_error_gradients_for_line(x, y, b0, b1, alpha=alpha, eps=eps)
        # Log the pre-update state every 10 steps (and at the final step).
        if step % 10 == 0 or step == n_steps - 1:
            y_hat = b0 + b1 * x
            history["step"].append(step)
            history["b0"].append(b0)
            history["b1"].append(b1)
            history["smooth_obj"].append(obj)
            history["max_error"].append(max_error_np(y, y_hat))
        # Plain gradient-descent step on the smooth surrogate.
        b0 -= lr * db0
        b1 -= lr * db1
    return (b0, b1), history
# OLS fit (minimizes MSE) via least squares on the design matrix [1, x].
X = np.column_stack([np.ones_like(x), x])
b0_ols, b1_ols = np.linalg.lstsq(X, y, rcond=None)[0]

# Smooth-minimax fit (approximate max_error minimization).
(alpha, lr, n_steps) = (40.0, 0.1, 2000)
(b0_mm, b1_mm), hist = fit_line_minimax_gd(x, y, alpha=alpha, lr=lr, n_steps=n_steps)
# Compare metrics on the training set
def summarize_fit(name, b0, b1):
    """Evaluate the line ``b0 + b1*x`` on the module-level training data (x, y).

    Returns a dict with the fit's name, coefficients, and its MAE / RMSE /
    max_error on the training set.
    """
    residuals = y - (b0 + b1 * x)
    abs_residuals = np.abs(residuals)
    return {
        "name": name,
        "b0": float(b0),
        "b1": float(b1),
        "MAE": float(np.mean(abs_residuals)),
        "RMSE": float(np.sqrt(np.mean(residuals**2))),
        "max_error": max_error_np(y, b0 + b1 * x),
    }
# Side-by-side training-set metrics for the two fits.
rows = [
    summarize_fit("OLS (MSE)", b0_ols, b1_ols),
    summarize_fit(f"Smooth-minimax (alpha={alpha:g})", b0_mm, b1_mm),
]
rows
[{'name': 'OLS (MSE)',
'b0': 1.34697546337952,
'b1': 1.9970499860377036,
'MAE': 0.7826401886556796,
'RMSE': 1.4007587709322311,
'max_error': 10.424859636373611},
{'name': 'Smooth-minimax (alpha=40)',
'b0': -2.8094854370763436,
'b1': 1.9269440646911562,
'MAE': 4.317537398451267,
'RMSE': 4.397525580220899,
'max_error': 6.45912538286567}]
# Training curves: surrogate objective (left) and the true max_error (right).
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=("Smooth surrogate objective", "Actual max_error"),
)
fig.add_trace(go.Scatter(x=hist["step"], y=hist["smooth_obj"], mode="lines", name="smooth obj"), row=1, col=1)
fig.add_trace(go.Scatter(x=hist["step"], y=hist["max_error"], mode="lines", name="max_error"), row=1, col=2)
fig.update_xaxes(title_text="step", row=1, col=1)
fig.update_yaxes(title_text="value", row=1, col=1)
fig.update_xaxes(title_text="step", row=1, col=2)
fig.update_yaxes(title_text="value", row=1, col=2)
fig.update_layout(title="Training progress")
fig.show()
# Plot fits + their worst-case bands
x_line = np.linspace(x.min(), x.max(), 250)

def yhat_line(b0, b1):
    # Predictions of the line (b0, b1) on the dense plotting grid x_line.
    return b0 + b1 * x_line

# Compute max_error (vertical band half-width) on the training points
mx_ols = max_error_np(y, b0_ols + b1_ols * x)
mx_mm = max_error_np(y, b0_mm + b1_mm * x)

fig = go.Figure()
fig.add_trace(go.Scatter(x=x, y=y, mode="markers", name="data", marker=dict(size=6, opacity=0.8)))

# OLS line + band
y_line_ols = yhat_line(b0_ols, b1_ols)
fig.add_trace(go.Scatter(x=x_line, y=y_line_ols, mode="lines", name=f"OLS line (max={mx_ols:.2f})", line=dict(color="#1f77b4")))
# Shaded band: by construction every training point lies within ±max_error
# of the line; the polygon is traced out-and-back to close the fill.
fig.add_trace(
    go.Scatter(
        x=np.concatenate([x_line, x_line[::-1]]),
        y=np.concatenate([y_line_ols + mx_ols, (y_line_ols - mx_ols)[::-1]]),
        fill="toself",
        fillcolor="rgba(31, 119, 180, 0.12)",
        line=dict(color="rgba(31, 119, 180, 0)"),
        name="OLS band",
        hoverinfo="skip",
        showlegend=False,
    )
)

# Minimax line + band
y_line_mm = yhat_line(b0_mm, b1_mm)
fig.add_trace(go.Scatter(x=x_line, y=y_line_mm, mode="lines", name=f"Smooth-minimax line (max={mx_mm:.2f})", line=dict(color="crimson")))
fig.add_trace(
    go.Scatter(
        x=np.concatenate([x_line, x_line[::-1]]),
        y=np.concatenate([y_line_mm + mx_mm, (y_line_mm - mx_mm)[::-1]]),
        fill="toself",
        fillcolor="rgba(220, 20, 60, 0.12)",
        line=dict(color="rgba(220, 20, 60, 0)"),
        name="Minimax band",
        hoverinfo="skip",
        showlegend=False,
    )
)
fig.update_layout(
    title="OLS vs minimax regression: max-error bands",
    xaxis_title="x",
    yaxis_title="y",
)
fig.show()
# Visualize how the log-sum-exp weights concentrate on large errors
def weights_for_params(b0, b1, *, alpha=40.0, eps=1e-6):
    """Return (smoothed |errors|, softmax weights) for the line ``b0 + b1*x``
    on the module-level training data (x, y)."""
    r = y - (b0 + b1 * x)
    e = smooth_abs(r, eps=eps)
    w = smooth_max_weights(e, alpha=alpha)
    return e, w

# Evaluate the weights at the minimax solution.
errors, weights = weights_for_params(b0_mm, b1_mm, alpha=alpha)

df = {
    "abs_error": errors,
    "weight": weights,
}
fig = px.scatter(
    df,
    x="abs_error",
    y="weight",
    title=f"Softmax weights over errors (alpha={alpha:g})",
)
fig.update_layout(xaxis_title="|error| (smoothed)", yaxis_title="weight (sums to 1)")
fig.show()
6) Pros, cons, and when to use max_error#
Pros#
Direct worst-case control: answers “what is the largest absolute miss?”
Interpretable: same units as \(y\) and corresponds to a tight error band
Useful for constraints / SLAs: when you must keep every error under a threshold
Cons / pitfalls#
Dominated by one point: improving many points doesn’t matter if the worst stays unchanged
Outlier sensitive: a single noisy label can dictate the score
Non-smooth objective: if you try to optimize it directly, gradients are unstable / undefined at ties and at zero
May trade average accuracy for worst-case (as seen in the minimax vs OLS example)
Good use cases#
Safety- or reliability-critical regression where worst-case error matters more than average error
Systems with explicit tolerances (calibration, manufacturing, control)
Monitoring / evaluation alongside MAE/RMSE to catch “rare but disastrous” failures
7) Exercises#
Replace the log-sum-exp max with a \(p\)-norm approximation: \(\|e\|_p = (\sum_i e_i^p)^{1/p}\) and study the limit \(p\to\infty\).
Solve the exact minimax regression via linear programming and compare to the smooth approximation.
Evaluate
max_error per group (e.g., slices of your data) and compare worst-group vs overall worst-case.
References#
scikit-learn:
sklearn.metrics.max_error
Chebyshev (minimax) approximation / \(\ell_\infty\) regression
Log-sum-exp as a smooth approximation to max